library(tidyverse)
library(readxl)

### This code reads in the raw data that is available online and processes it to create 
### a single dataset of Russian servicemembers based on the leaked data.
### Note that due to PI, some components of the actual data cleaning process are withheld;
### This code is solely provided for reference purposes.

# Read in source files, de-duplicate each of the files

# Seed the random number generator from the current clock time expressed in
# whole minutes (epoch seconds integer-divided by 60), so the seed differs
# between runs started in different minutes.
set.seed(as.numeric(Sys.time()) %/% 60)

# Paths to the two raw source spreadsheets; left blank in this copy of the
# script.
path_to_file_1 <- ""
path_to_file_2 <- ""



# First source file: derive the issuing location and keep the shared columns.
# `issuer` is the region when that field is a non-empty string, the city when
# the region is "" and the city is non-empty, and the `Кем выдан` field
# otherwise. Comparisons against NA evaluate to NA, which case_when() treats
# as "no match", so rows with a missing region fall through to `Кем выдан`.
src_1 <- path_to_file_1 |>
  readxl::read_excel() |>
  mutate(
    issuer = case_when(
      `Регион (штат, федер. земля, пр` != "" ~ `Регион (штат, федер. земля, пр`,
      `Регион (штат, федер. земля, пр` == "" & `Город` != "" ~ `Город`,
      TRUE ~ as.character(`Кем выдан`)
    )
  ) |>
  # Keep and rename the columns shared with the second source.
  dplyr::select(
    rank = `Воинское звание`,
    name = `Фамилия, Имя, Отчество`,
    unit = `Воинская часть`,
    issuer
  ) |>
  # No date of birth is taken from this file: `dob` is an all-NA placeholder.
  # `source` records which file each row came from.
  mutate(dob = NA, source = path_to_file_1) |>
  distinct()

# Second source file: keep the five shared columns under their common names,
# tag each row with its source path, and drop exact duplicate rows.
src_2 <- path_to_file_2 |>
  readxl::read_excel() |>
  dplyr::select(
    rank = `Наименование воинского звания`,
    name = `Табельный номер`,
    unit = `Раздел персонала`,
    issuer = `Кем выдан`,
    dob = `ДатаРожд`
  ) |>
  mutate(source = path_to_file_2) |>
  distinct()


# Read in coded geography information. A good portion of locations in the source data
# had to be coded manually, since the data was not recorded in a consistent manner
# that would have allowed for automatic coding via regex. Since this approach involves a substantial deanonymization of the dataset
# we cannot include the manually coded data in the repository.

# Workbook holding the manually coded geography; path left blank in this copy.
locations_path <- ""

# Sheet keyed by the raw location string.
geographic_locations <- distinct(
  readxl::read_excel(locations_path, sheet = "manually_coded_geog")
)
# Sheet keyed by issuer, regex-coded location and date of birth.
locations_personnel <- distinct(
  readxl::read_excel(locations_path, sheet = "manually_coded_geog_dob")
)



# Put all together:
# Bind both sources
# warnings <- capture_warnings({
# Build the combined, regex-coded interim table.
#
# Steps:
#   1. Stack both sources; when `issuer` is missing, fall back to `unit`.
#   2. Derive `test`, an automatic region/city label, from the free-text
#      issuer using ordered, mostly case-insensitive regexes. case_when()
#      returns the FIRST matching rule, so the order of the rules matters.
#   3. Strip the "military unit" prefix from `unit` and re-guess column types.
#   4. Attach the manually coded lookups (`Code.x` by raw issuer string,
#      `Code.y` by regex label + issuer + date of birth).
#   5. Build a composite `id` and count how often each id occurs (`n_matches`).
#
# NOTE(review): several alternatives are very broad (e.g. `п.*кр`, `У.*де`,
# `у.*э`) and can match unrelated strings, shadowing later, more specific
# rules. They are left unchanged here; verify against the manually coded data.
interim_file_with_duplicates <- bind_rows(src_1, src_2) |>
  mutate(
    issuer = case_when(
      is.na(issuer) ~ unit,
      TRUE ~ as.character(issuer)
    )
  ) |>
  mutate(
    # Regex rules that recode addresses into administrative units or cities
    oblast = issuer,
    test = case_when(
      str_detect(oblast, "(?i)Краснодар(?=[ ,.ае]|$)") ~ "Г. Краснодар",
      str_detect(oblast, "(?i)Красноярск(?=[ ,.ае]|$)") ~ "Г. Красноярск",
      str_detect(oblast, "(?i)Воронеж(?=[ ,.ае]|$)") ~ "Г. Воронеж",
      str_detect(oblast, "(?i)Екатеринбург(?=[ ,.ае]|$)|екат") ~ "Г. Екатеринбург",
      str_detect(oblast, "(?i)Казан(?=[ ,.ьи]|$)") ~ "Г. Казань",
      str_detect(oblast, "(?i)Москв(?=[ ,.аы]|$)") ~ "Г. Москва",
      str_detect(oblast, "(?i)Ниж.*Нов|Н.*Новг") ~ "Г. Нижний Новгород",
      str_detect(oblast, "(?i)Новосибирск(?=[ ,.ае]|$)|НОВОСИБРСКА") ~ "Г. Новосибирск",
      str_detect(oblast, "(?i)Омск(?=[ ,.ае]|$)") ~ "Г. Омск",
      str_detect(oblast, "(?i)Перм(?=[ ,.ьи]|$)") ~ "Г. Пермь",
      str_detect(oblast, "(?i)Волгоград(?=[ ,.ае]|$)") ~ "Г. Волгоград",
      str_detect(oblast, "(?i)Самар(?=[ ,.аы]|$)") ~ "Г. Самара",
      str_detect(oblast, "(?i)Ростов.*на-Дону") ~ "Г. Ростов-на-Дону",
      str_detect(oblast, "(?i)Самара") ~ "Г. Самара",
      str_detect(oblast, "(?i)Санкт-Петербург|петербург|спб") ~ "Г. Санкт-Петербург",
      str_detect(oblast, "(?i)Севастополь") ~ "Г. Севастополь",
      str_detect(oblast, "(?i)Уфа") ~ "Г. Уфа",
      str_detect(oblast, "(?i)Челябинск") ~ "Г. Челябинск",
      str_detect(oblast, '(?i)Москва|г. Москва|г.Москве|гор. Москва|"ОВД "" Капотня"""|г. Москве|г. Москвa|г. Москвы') ~ "г. Москва",
      str_detect(oblast, "(?i)Санкт-Петербург|Санкт-Петербурга|спб|санкт|Санск-Петербурга|С-Пб") ~ "Г. Санкт-Петербург",
      str_detect(oblast, "(?i)КЧР|Карачаево-Черкесск|карач") ~ "Республика Карачаево-Черкесия",
      str_detect(oblast, "(?i)Нальчик") ~ "Республика Кабардино-Балкария",
      str_detect(oblast, "(?i)Алтайскго края|алт.*кр|Алиайскому кр|Алайскоум кр|АЛТАЙКОМУ КР") ~ "Алтайский край",
      str_detect(oblast, "(?i)Абурской обл|аму.*об|благове|белогорск|АМУРСОЙ ОБЛ|АМРСКОИ ОБЛ|.*амуре") ~ "Амурская область",
      # Parentheses are escaped so they match literally instead of forming a
      # regex group.
      str_detect(oblast, "(?i)Саха \\(Якутия\\)|Якути| Саха ") ~ "Республика Саха (Якутия)",
      str_detect(oblast, "(?i)Бурятия|Бурятии|Буртия|Республике Бурятия|У-у|Улан-|Улан |У.*де|у.*э|Бурятия|УЛАН-УДЭ|УЛАН УДЭ") ~ "Республика Бурятия",
      str_detect(oblast, "(?i)Татарстан") ~ "Республика Татарстан",
      str_detect(oblast, "(?i)Дагестан|Дагистан|Дагесиан|махач|ДАССР") ~ "Республика Дагестан",
      str_detect(oblast, "(?i)Тыва") ~ "Республика Тыва",
      str_detect(oblast, "(?i)Башкортостан|Башк|Башкир") ~ "Республика Башкортостан",
      str_detect(oblast, "(?i)Чит.*об|Чиинской обл.*|читы| читы?|\\..*чита|агинск|АБАЙКАЛЬСКОМУ КР|ЗАБАКЙКАЛЬС|Заб кр|Заб.кр|Заюайкальском|Збайкльско|Забайкаль") ~ "Забайкальский край",
      str_detect(oblast, "(?i)АМРУСК|(?i)АМУРСК|Амуск") ~ "Амурская область",
      str_detect(oblast, "(?i)БЕЛГОРОДСКАЯ|БЕЛГОРОД|белогород") ~ "Белгородская область",
      str_detect(oblast, "(?i)АРХАНГЕЛЬСКОЙ") ~ "Архангельская область",
      str_detect(oblast, "(?i)Московская|МОСКОВСКОЙ ОБЛ|Ногинского р-на МО|Моск.*обл") ~ "Московкая область",
      str_detect(oblast, "(?i)Удму") ~ "Удмуртская Республика",
      str_detect(oblast, "(?i)Чуваш") ~ "Республика Чувашия",
      str_detect(oblast, "(?i)Кемеров") ~ "Кемеровская область",
      # The pattern deliberately matches the misspelled input; the output
      # label is the correctly spelled region name.
      str_detect(oblast, "(?i)Лененградск") ~ "Ленинградская область",
      str_detect(oblast, "(?i)Иркутск") ~ "Иркутская область",
      str_detect(oblast, "(?i)Еврейск|ЕАО|БИРОБИДЖА") ~ "Еврейская АО",
      str_detect(oblast, "по НСО") ~ "Новосибирская область",
      str_detect(oblast, "(?i)Камч.*об|Усть-Камчатский район|КАМЧАТСК") ~ "Камчатский край",
      str_detect(oblast, "(?i)приморского края|приморског.*|п\\-к|владив|артем|артём|п.*кр|уссури|находк|пк$|влад.*к|приморский кр|прим.кр|приморского кр|приморскому") ~ "Приморский край",
      str_detect(oblast, "(?i)прморскому кр|приморском кр|приморкому кр|пиморскому кр|яковлевским ровд приморского края|шкотовским ровд прим. края|хасанским ровд пк|черниговским ровд пк|приорскому кр|примосркого края|примосркго края|приморкого края|прим кр|россии по пк|приморскому к|чугуевским ровд пк") ~ "Приморский край",
      str_detect(oblast, "(?i)Северная Осетия|владикавк") ~ "Северная Осетия-Алания",
      str_detect(oblast, "(?i)Хабароскому кр|Хабаровкому кр|Хабароского края|Хаб края|Хабаровск|2 отделением милиции УВД Индустриального района") ~ "Хабаровский край",
      str_detect(oblast, "(?i)Алтайск.кр|барнаул") ~ "Алтайский край",
      str_detect(oblast, "(?i)Амурс") ~ "Амурская область",
      str_detect(oblast, "(?i)Арханге") ~ "Архангельская область",
      str_detect(oblast, "(?i)Астрахан") ~ "Астраханская область",
      str_detect(oblast, "(?i)Байконур") ~ "Байконур",
      str_detect(oblast, "(?i)Белгород") ~ "Белгородская область",
      str_detect(oblast, "(?i)Брян") ~ "Брянская область",
      str_detect(oblast, "(?i)Владимир") ~ "Владимирская область",
      # `Волг.*обл` rather than `Вол.*обл`: the shorter prefix also matched
      # "Вологодская обл..." and, being listed first, claimed it for Volgograd.
      str_detect(oblast, "(?i)Волгоградск|Волг.*обл") ~ "Волгоградская область",
      str_detect(oblast, "(?i)Вологод|Вологд") ~ "Вологодская область",
      str_detect(oblast, "(?i)Воронежск.*об") ~ "Воронежская область",
      str_detect(oblast, "(?i)Еврейск") ~ "Еврейская автономная область",
      str_detect(oblast, "(?i)Забайкал.*кр") ~ "Забайкальский край",
      str_detect(oblast, "(?i)Заграница") ~ "Заграница",
      str_detect(oblast, "(?i)Ивановск.*об|иваново") ~ "Ивановская область",
      str_detect(oblast, "(?i)Иркутск") ~ "Иркутская область",
      str_detect(oblast, "(?i)Калининг.*об|калининград") ~ "Калининградская область",
      str_detect(oblast, "(?i)Калининс.") ~ "Тверская область",
      str_detect(oblast, "(?i)Калужск.*об|Калуг") ~ "Калужская область",
      str_detect(oblast, "(?i)Камч.*кр") ~ "Камчатский край",
      str_detect(oblast, "(?i)Кемер.*об|Кемеров") ~ "Кемеровская область",
      str_detect(oblast, "(?i)Кир.*об|ГОР.КИРОВА|города кирова") ~ "Кировская область",
      str_detect(oblast, "(?i)Коми-Пермяцкий АО") ~ "Коми-Пермяцкий АО",
      str_detect(oblast, "(?i)Комст.") ~ "Комстромская область",
      str_detect(oblast, "(?i)Костро.*об|костром") ~ "Костромская область",
      str_detect(oblast, "(?i)Краснод.*кр") ~ "Краснодарский край",
      str_detect(oblast, "(?i)Краснояр.*кр|Красноярским РОВД|ГОР. КРАСНОЯРСК-66|ГОР. КРАСНОЯРСК-45") ~ "Красноярский край",
      str_detect(oblast, "(?i)Курга.") ~ "Курганская область",
      str_detect(oblast, "(?i)Курск.*об|курск") ~ "Курская область",
      str_detect(oblast, "(?i)Лен.*об") ~ "Ленинградская область",
      str_detect(oblast, "(?i)Липец.*об|липецк") ~ "Липецкая область",
      str_detect(oblast, "(?i)Магада.*об|магадан") ~ "Магаданская область",
      str_detect(oblast, "(?i)Моско.*об") ~ "Московская область",
      str_detect(oblast, "(?i)Мурман.*об|мурманск") ~ "Мурманская область",
      str_detect(oblast, "(?i)Ненецкий АО") ~ "Ненецкий АО",
      str_detect(oblast, "(?i)Нижег.*об|Горьк.*обл") ~ "Нижегородская область",
      str_detect(oblast, "(?i)Новгор.*об") ~ "Новгородская область",
      str_detect(oblast, "(?i)Новос.*об") ~ "Новосибирская область",
      str_detect(oblast, "(?i)Омск.*об") ~ "Омская область",
      str_detect(oblast, "(?i)Оренб.*об|оренбург") ~ "Оренбургская область",
      str_detect(oblast, "(?i)Орлов.*об|ОРЛА|орел") ~ "Орловская область",
      str_detect(oblast, "(?i)Пенз.*об|пенз") ~ "Пензенская область",
      str_detect(oblast, "(?i)Пермс.*кр|Пермс.*об") ~ "Пермский край",
      str_detect(oblast, "(?i)Прим.*кр") ~ "Приморский край",
      str_detect(oblast, "(?i)Пск.*об|псков") ~ "Псковская область",
      str_detect(oblast, "(?i)Адыг.*") ~ "Республика Адыгея",
      str_detect(oblast, "(?i)Алтай") ~ "Республика Алтай",
      str_detect(oblast, "(?i)Алта") ~ "Республика Алтай",
      str_detect(oblast, "(?i)Башкор.*|башкир") ~ "Республика Башкортостан",
      str_detect(oblast, "(?i)Бурятия|буря") ~ "Республика Бурятия",
      str_detect(oblast, "(?i)Дагес.*") ~ "Республика Дагестан",
      str_detect(oblast, "(?i)Ингушет.*|назрань") ~ "Республика Ингушетия",
      str_detect(oblast, "(?i)Кабард.*|кбр|кбасср") ~ "Кабардино-Балкарская республика",
      str_detect(oblast, "(?i)Калмы.*") ~ "Республика Калмыкия",
      str_detect(oblast, "(?i)Карача.*|кчр") ~ "Карачаево-Черкесская республика",
      str_detect(oblast, "(?i)Карелия|карел|петрозав") ~ "Республика Карелия",
      str_detect(oblast, "(?i)Коми") ~ "Республика Коми",
      str_detect(oblast, "(?i)Крым") ~ "Республика Крым",
      str_detect(oblast, "(?i)Марий Эл|марий|ошкар") ~ "Республика Марий Эл",
      str_detect(oblast, "(?i)Морд.*|саранс") ~ "Республика Мордовия",
      str_detect(oblast, "(?i)Саха \\(Якутия\\)|якут") ~ "Республика Саха (Якутия)",
      str_detect(oblast, "(?i)Сев.*Осет|моздок") ~ "Республика Северная Осетия — Алания",
      str_detect(oblast, "(?i)алани") ~ "Республика Северная Осетия — Алания",
      str_detect(oblast, "(?i)Татарстан|татарс|ТАССР") ~ "Республика Татарстан",
      str_detect(oblast, "(?i)Тыва|тыве|тувин") ~ "Республика Тыва",
      str_detect(oblast, "(?i)Хакас") ~ "Республика Хакасия",
      str_detect(oblast, "(?i)Чуваш") ~ "Чувашская республика",
      str_detect(oblast, "(?i)Рост.*об") ~ "Ростовская область",
      str_detect(oblast, "(?i)Рязан") ~ "Рязанская область",
      str_detect(oblast, "(?i)Самар.*об|куй.*об") ~ "Самарская область",
      str_detect(oblast, "(?i)Сарат.*об|Сарат") ~ "Саратовская область",
      str_detect(oblast, "(?i)Сахал.*об|сахал|ю.*сах") ~ "Сахалинская область",
      str_detect(oblast, "(?i)Сверд.*об") ~ "Свердловская область",
      str_detect(oblast, "(?i)Смолен.") ~ "Смоленская область",
      str_detect(oblast, "(?i)Ставро") ~ "Ставропольский край",
      str_detect(oblast, "(?i)Тамбов.*об|Тамб") ~ "Тамбовская область",
      str_detect(oblast, "(?i)Твер.") ~ "Тверская область",
      str_detect(oblast, "(?i)Том.*об") ~ "Томская область",
      str_detect(oblast, "(?i)Тул.*об|тулы|Тула") ~ "Тульская область",
      str_detect(oblast, "(?i)Тюмен") ~ "Тюменская область",
      str_detect(oblast, "(?i)Удмурт") ~ "Удмуртская республика",
      str_detect(oblast, "(?i)Ульянов.*об|ульяновск") ~ "Ульяновская область",
      str_detect(oblast, "(?i)ФМС") ~ "ФМС",
      str_detect(oblast, "(?i)Фед.*м") ~ "Федеральная миграционная служба",
      str_detect(oblast, "(?i)Хаб.*кр|хабаров") ~ "Хабаровский край",
      str_detect(oblast, "(?i)Хант|югра") ~ "Ханты-Мансийский АО — Югра",
      # `.*` rather than `*.`: the quantifier belongs on the wildcard, not on
      # the letter before it.
      str_detect(oblast, "(?i)Челяб.*обл") ~ "Челябинская область",
      str_detect(oblast, "(?i)Чечен|грозн|ЧИАССР") ~ "Чеченская республика",
      str_detect(oblast, "(?i)Чукот|анад") ~ "Чукотский АО",
      str_detect(oblast, "(?i)Ямал") ~ "Ямало-Ненецкий АО",
      str_detect(oblast, "(?i)Яросл.*об|ярослав|г.Ярославля") ~ "Ярославская область",
      str_detect(oblast, "Воронежская") ~ "Воронежская область",
      str_detect(oblast, "Ивановская") ~ "Ивановская область",
      str_detect(oblast, "Калужская") ~ "Калужская область",
      str_detect(oblast, "Краснодарский|Ейск") ~ "Краснодарский край",
      str_detect(oblast, "Красноярский") ~ "Красноярский край",
      str_detect(oblast, "Ленинградская") ~ "Ленинградская область",
      str_detect(oblast, "Нижегородская") ~ "Нижегородская область",
      str_detect(oblast, "Новгородская") ~ "Новгородская область",
      str_detect(oblast, "Орловская") ~ "Орловская область",
      str_detect(oblast, "Пензенская") ~ "Пензенская область",
      str_detect(oblast, "Пермский") ~ "Пермский край",
      # Primorsky is a krai; the label is aligned with every other Primorsky
      # rule above.
      str_detect(oblast, "Приморский") ~ "Приморский край",
      str_detect(oblast, "Ростовская") ~ "Ростовская область",
      str_detect(oblast, "Самарская") ~ "Самарская область",
      str_detect(oblast, "Свердловская") ~ "Свердловская область",
      str_detect(oblast, "Ставропольский") ~ "Ставропольский край",
      str_detect(oblast, "Тульская") ~ "Тульская область",
      TRUE ~ NA_character_
    ),
    # str_squish() trims both ends and collapses internal runs of whitespace.
    test = str_squish(test)
  ) |>
  # Strip the "military unit" prefix so only the unit number remains;
  # type_convert() below then re-guesses the types of character columns.
  mutate(
    unit = str_replace_all(unit, "Войсковая часть |В/ч |войсковая часть |в/ч |В/ЧАСТЬ ", "")
  ) |>
  type_convert() |>
  # Where values cannot be coded automatically, we merge with geographic data
  # that were coded manually. The first lookup is keyed by the raw issuer
  # string and yields `Code.x`.
  left_join(
    geographic_locations |>
      dplyr::select(oblast_source, Code) |>
      type_convert() |>
      distinct(),
    by = c("oblast" = "oblast_source")
  ) |>
  # The second lookup is keyed by regex label, issuer and date of birth and
  # yields `Code.y`.
  left_join(
    locations_personnel |>
      dplyr::select(issuer, oblast, dob, Code) |>
      type_convert(),
    by = c(
      "test" = "oblast",
      "issuer" = "issuer",
      "dob" = "dob"
    )
  ) |>
  # Composite record id, plus the number of rows sharing that id.
  mutate(
    id = paste0(rank, "-", name, "-", oblast, "-", issuer, "-", dob)
  ) |>
  add_count(id, name = "n_matches")



# Remove duplicates by first name/dob. Considering that some records have
# been entered twice, we need to remove duplicates by first name/dob and
# place of registration. Duplicates arise because place of registration is
# entered in a free text field, and there are many ways to write the same
# place, e.g., "Краснодарский край" and "Краснодарский кр." are the same
# place, but will be entered differently in the database for the same person.

# Collapse every duplicated id to a single record.
# The rows are sorted by id and the first row of each id is kept. Unlike
# keeping every odd-numbered row, this stays correct when an id occurs three
# or more times (odd/even selection assumes exactly two copies per id and
# shifts parity for every later id otherwise).
de_duplicated_duplicates <- interim_file_with_duplicates |>
  filter(n_matches > 1) |>
  arrange(id, desc(n_matches)) |>
  distinct(id, .keep_all = TRUE)

# Retain only non-duplicate records
# Recombine the records that were unique from the start with the cleaned
# duplicates, then choose the final geography per row. Rules are evaluated in
# order: a non-empty `Code.x` wins, then a non-empty `Code.y`, then the regex
# label `test`; the is.na() rules cover missing codes in the same order.
ruaf_full_raw <- bind_rows(
  filter(interim_file_with_duplicates, n_matches == 1),
  de_duplicated_duplicates
) |>
  dplyr::select(-id) |>
  mutate(
    geography = case_when(
      Code.x != "" ~ Code.x,
      Code.x == "" & Code.y != "" ~ Code.y,
      Code.x == "" & Code.y == "" ~ test,
      is.na(Code.x) & !is.na(Code.y) ~ Code.y,
      !is.na(Code.x) & is.na(Code.y) ~ Code.x,
      is.na(Code.x) & is.na(Code.y) ~ test,
      TRUE ~ NA_character_
    )
  ) |>
  # filter(is.na(geography)) |>
  distinct()


# The row count is not expected to match the original sources, because
# duplicates arose both from repeated first name/dob entries AND from
# differences in how addresses were spelled (e.g. "г. Москва" vs "Москва").

# Keep only the analysis columns.
ruaf_full <- dplyr::select(
  ruaf_full_raw,
  rank, name, unit, issuer, geography, source, dob
)

# Does the final row count equal the sum of the de-duplicated source rows?
nrow(ruaf_full) == (nrow(distinct(src_1)) + nrow(distinct(src_2)))

# Number of records per source file.
count(group_by(ruaf_full, source))


## Anonymize processed data

library(digest)

# Reference: Gorecki, 2014: http://jangorecki.github.io/blog/2014-11-07/Data-Anonymization-in-R.html

# Replace every value of `x` with a hash of that value.
#
# Args:
#   x:    Vector of values to anonymize. Factors and other non-character
#         vectors are hashed on their character representation.
#   algo: Hash algorithm name passed on to digest(); defaults to "crc32".
#
# Returns:
#   An unnamed character vector the same length as `x`. Equal inputs yield
#   equal hashes. NA and empty-string inputs yield NA, as the original
#   name-based lookup did.
anonymize <- function(x, algo="crc32"){
  # Look hashes up by VALUE. Indexing the hash table with `x` itself selects
  # by integer code/position when `x` is a factor or numeric vector, which
  # returns the wrong hashes.
  values <- as.character(x)
  hashable <- !is.na(values) & nzchar(values)
  unq_values <- unique(values[hashable])
  # Each distinct value is hashed once.
  unq_hashes <- vapply(
    unq_values,
    function(object) digest(object, algo = algo),
    FUN.VALUE = "",
    USE.NAMES = FALSE
  )
  # match() yields NA for values that were not hashed, so those stay NA.
  unq_hashes[match(values, unq_values)]
}


### Merge with ancillary data

### Last names, based on the Memorial dataset
# Path to the Memorial surname file; left blank in this copy of the script.
last_names_path_memorial <- ""

# Every surname present in this file is flagged with ethnic_id = 1.
surnames <- mutate(
  read_csv(last_names_path_memorial),
  ethnic_id = 1
)

### Ranks, based on the Russian legislation

ranks <- read_excel(path = "ancillary_data.xlsx", sheet = "ranks")

# Geography sheet: federal districts, "millionaire" cities and national
# republics.

geography <- read_excel(path = "ancillary_data.xlsx", sheet = "geography")

# Merge keys. Since our initial dataset separates "millionaire" cities from
# their oblasts, we merge with key variables that group them with oblasts.
# This is done because census and economic data are available at the oblast
# level.

keys <- read_excel(path = "ancillary_data.xlsx", sheet = "merge_codes")

# Census and economic data by subject. Two oblasts are removed because they
# contain smaller autonomous okrugs.
ancillary_data <- distinct(
  filter(
    read_excel(path = "ancillary_data.xlsx", sheet = "census_econ"),
    !Subject %in% c("Архангельская область", "Тюменская область")
  )
)

# Split the full name on single spaces into surname, first name and
# patronymic.
ruaf_surnames <- separate(
  ruaf_full,
  name,
  into = c("surname", "first", "patronym"),
  sep = " "
)

# Distinct surname / first-name pairs, renamed to last_name / first_name.
unique_names <- ruaf_surnames |>
  distinct(surname, first) |>
  rename(last_name = surname, first_name = first)

# Normalize both name columns (sentence case first, then remove every
# "hyphen + space" sequence) and write the result to a CSV file that is used
# for prediction with the Bessudnov et al. algorithm. Note that this file is
# not included in the replication file.
unique_names |>
  mutate(
    across(
      everything(),
      ~ str_replace_all(str_to_sentence(.x), "\\-\\ ", "")
    )
  ) |>
  write_csv("ruaf_data.csv")

# Run the Bessudnov et al. classifier and generate files with predictions and probabilities


# Categorical predictions
# Classifier output: categorical predictions, exact duplicate rows removed.
cat_pred <- distinct(read_csv("ruaf_pred.csv"))

# Classifier output: probabilistic predictions, exact duplicate rows removed.
prob_pred <- distinct(read_csv("ruaf_pr_pred.csv"))

# Three-way bin for a probability `p` against `cutoff`:
# 1 when p >= cutoff, 0 when p is missing, 2 otherwise.
.flag_at_least <- function(p, cutoff) {
  case_when(
    p >= cutoff ~ 1,
    is.na(p) ~ 0,
    TRUE ~ 2
  )
}

# Three-way bin for a probability `p` against `cutoff`:
# 1 when p <= cutoff, 0 when p is missing, 2 otherwise.
.flag_at_most <- function(p, cutoff) {
  case_when(
    p <= cutoff ~ 1,
    is.na(p) ~ 0,
    TRUE ~ 2
  )
}

# Build the full (not yet anonymized) analysis table: normalize names, infer
# sex, attach surname/classifier/rank/geography/census data and derive the
# analysis variables. The result is kept as `ruaf_full_not_anon`.
ruaf_full_not_anon <- ruaf_surnames |>
  mutate(
    # Masculine form of the surname so it can be merged with the Memorial
    # data.
    # NOTE(review): these patterns are lower-case only and run before the
    # surname is converted to sentence case — confirm the raw surnames are
    # not upper-case.
    last_name = case_when(
      str_detect(surname, "ва$|вa-") ~ str_replace(surname, "ва$|вa-", "в"),
      str_detect(surname, "на$|нa-") ~ str_replace(surname, "на$|нa-", "н"),
      str_detect(surname, "ая$") ~ str_replace(surname, "ая$", "ий"),
      TRUE ~ as.character(surname)
    ),
    last_name = str_to_sentence(last_name),
    surname = str_to_sentence(surname),
    first = str_to_sentence(first),
    across(
      c(surname, first), ~ str_replace_all(.x, "\\-\\ ", "")
    ),
    # Infer sex (1 = male, 2 = female) from the patronymic first, then from
    # the surname ending, then from two specific first names; default is 1.
    sex = case_when(
      str_detect(patronym, "ч$|Ч$|лы$|ЛЫ$") ~ 1,
      str_detect(patronym, "а$|А$|зы$|ЗЫ$") ~ 2,
      str_detect(surname, "ва$|вa-|на$|НА$|ВА$") ~ 2,
      str_detect(surname, "в$|н$|Н$|В$") ~ 1,
      str_detect(first, "Раса|Гюльнара") ~ 2,
      TRUE ~ 1
    )
  ) |>
  ## Join the Memorial last name data
  left_join(surnames, by = c("last_name" = "surname")) |>
  ## Join the categorical and probabilistic classifier predictions
  left_join(
    cat_pred, by = c(
      "surname" = "last_name",
      "first" = "first_name"
    )
  ) |>
  left_join(
    prob_pred, by = c(
      "surname" = "last_name",
      "first" = "first_name"
    )
  ) |>
  ## Join ranks data
  left_join(
    ranks, by = "rank"
  ) |>
  # Correct typos and harmonize spellings in geography. Values without a rule
  # are kept unchanged by the default branch.
  mutate(
    geography = case_when(
      geography == "Байконур" ~ "Заграница",
      geography == "Владимирая область" ~ "Владимирская область",
      geography == "Сахалинская облать" ~ "Сахалинская область",
      geography == "Ханты-Мансийский АО — Югра" ~ "Ханты-Мансийский АО - Югра",
      geography == "Хабарсовский край" ~ "Хабаровский край",
      geography == "г. Москва" ~ "Г. Москва",
      geography == "г. Севастополь" ~ "Г. Севастополь",
      geography == "Кабардино-Балкарская Республика" ~ "Республика Кабардино-Балкария",
      geography == "Кабардино-Балкарская республика" ~ "Республика Кабардино-Балкария",
      geography == "Московкая область" ~ "Московская область",
      geography == "Орбенбургская область" ~ "Оренбургская область",
      geography == "Комстромская область" ~ "Костромская область",
      geography == "Приморсий край" ~ "Приморский край",
      geography == "Хабаровска краю" ~ "Хабаровский край",
      geography == "Рязаньская область" ~ "Рязанская область",
      geography == "Самарская обл" ~ "Самарская область",
      geography == "Ростоская область" ~ "Ростовская область",
      geography == "Республика Северная Осетия — Алания" ~ "Республика Северная Осетия - Алания",
      geography == "Северная Осетия-Алания" ~ "Республика Северная Осетия - Алания",
      geography == "Северная Оссетия - Алания" ~ "Республика Северная Осетия - Алания",
      geography == "Тверьская область" ~ "Тверская область",
      geography == "Тюменьская область" ~ "Тюменская область",
      geography == "Удмуртская республика" ~ "Удмуртская Республика",
      geography == "Ульяновскская область" ~ "Ульяновская область",
      geography == "Ребпулика Кабардино-Балкария" ~ "Республика Кабардино-Балкария",
      geography == "ФМС" ~ "Федеральная миграционная служба",
      geography == "г. Севостополь" ~ "Г. Севастополь",
      geography %in% c("Тамбовкая область", "Тамобвская область") ~ "Тамбовская область",
      TRUE ~ as.character(geography)
    )
  ) |>
  left_join(geography, by = "geography") |>
  mutate(
    # Explicit federal districts for selected regions; every other region
    # keeps the value that came from the geography sheet.
    federal_district = case_when(
      geography == "Московская область" ~ "Central",
      geography == "Владимирская область" ~ "Central",
      geography == "Рязанская область" ~ "Central",
      geography == "Тамбовская область" ~ "Central",
      geography == "Тверская область" ~ "Central",
      geography == "Тюменская область" ~ "Ural",
      geography == "Самарская область" ~ "Volga",
      geography == "Удмуртская Республика" ~ "Volga",
      geography == "Ульяновская область" ~ "Volga",
      # Keyed on the corrected spelling: the typo correction above has
      # already rewritten "г. Севостополь", so a rule on the misspelled value
      # could never match.
      geography == "Г. Севастополь" ~ "South",
      geography == "Приморский край" ~ "Far east",
      geography == "Хабаровский край" ~ "Far east",
      geography == "Республика Кабардино-Балкария" ~ "North Caucasus",
      geography == "Чечено-Ингушская АССР" ~ "North Caucasus",
      TRUE ~ as.character(federal_district)
    ),
    sex = factor(sex, levels = c(1, 2), labels = c("Male", "Female")),
    ethn_non_e_slavs = 1 - ethn_1 - ethn_2 - ethn_11,
    # This is necessary due to precision issues in calculating probabilities:
    # saved probabilities contained decimals that were not properly saved in
    # the CSV file, so slightly negative results are clamped to 0.
    ethn_non_e_slavs = case_when(
      ethn_non_e_slavs < 0 ~ 0,
      TRUE ~ as.double(ethn_non_e_slavs)
    ),
    ## Tatars, 3
    tatar_prob_95pct = .flag_at_least(ethn_3, 0.95),
    ## Other Caucasus nationalities, 16
    cauc_prob_95pct = .flag_at_least(ethn_16, 0.95),
    rank_large_f = factor(rank_large, levels = (0:4), labels = c("Enlisted", "Sergeants", "Junior officers", "Senior officers", "General officers")),
    branch = factor(branch, levels = c(1:4), labels = c("Army", "Navy", "Medical", "Justice")),
    # 1 when the surname was found in the Memorial data, 0 otherwise
    # (including unmatched rows, where the joined value is NA).
    ethnic_id = case_when(
      ethnic_id == 1 ~ 1,
      TRUE ~ 0
    ),
    # Indicator columns: any non-missing joined value counts as 1.
    millionaires = if_else(is.na(millionaires), 0, 1),
    national_republics = if_else(is.na(national_republics), 0, 1),
    rank_known = if_else(!is.na(rank_large_f), 1, 0),
    # Generate probability bins. Eventually, we will use 95% bins
    prob_99pct = .flag_at_least(ethn_non_e_slavs, 0.99),
    prob_95pct = .flag_at_least(ethn_non_e_slavs, 0.95),
    prob_90pct = .flag_at_least(ethn_non_e_slavs, 0.9),
    prob_80pct = .flag_at_least(ethn_non_e_slavs, 0.8),
    prob_70pct = .flag_at_least(ethn_non_e_slavs, 0.7),
    prob_60pct = .flag_at_least(ethn_non_e_slavs, 0.6),
    prob_50pct = .flag_at_least(ethn_non_e_slavs, 0.5),
    prob_1pct = .flag_at_most(ethn_non_e_slavs, 0.01),
    prob_5pct = .flag_at_most(ethn_non_e_slavs, 0.05),
    prob_10pct = .flag_at_most(ethn_non_e_slavs, 0.1),
    prob_20pct = .flag_at_most(ethn_non_e_slavs, 0.2),
    prob_30pct = .flag_at_most(ethn_non_e_slavs, 0.3),
    prob_40pct = .flag_at_most(ethn_non_e_slavs, 0.4),
    # Generate birth cohorts.
    # NOTE(review): these comparisons assume `dob` is a Date (or comparable
    # to one) — confirm the column type produced by the import step.
    birth_cohorts = case_when(
      dob < as.Date("1975-01-01") ~ 1, # "1974 or earlier",
      dob >= as.Date("1975-01-01") & dob < as.Date("1981-01-01") ~ 2, # "1975-1980",
      dob >= as.Date("1981-01-01") & dob < as.Date("1985-01-01") ~ 3, # "1981-1984",
      dob >= as.Date("1985-01-01") & dob < as.Date("1991-01-01") ~ 4, # "1985-1990",
      dob >= as.Date("1991-01-01") & dob < as.Date("1995-01-01") ~ 5, # "1991-1994",
      dob >= as.Date("1995-01-01") ~ 6, # "1995 or later",
      is.na(dob) ~ 99 # "Birth year unknown"
    ),
    birth_cohorts = factor(
      birth_cohorts, levels = c(1, 2, 3, 4, 5, 6, 99), labels = c(
        "1974 or earlier", "1975-1980", "1981-1984", "1985-1990", "1991-1994", "1995 or later", "Birth year unknown"
      )
    ),
    prob_95pct = factor(
      prob_95pct, levels = c(0, 1, 2), labels = c("Not classified", "National minority", "Eastern Slav")
    )
  ) |>
  # Join keys and ancillary data
  left_join(keys, by = "geography") |>
  left_join(ancillary_data, by = "Subject") |>
  dplyr::select(
    c(
      rank, surname, first, patronym, sex, birth_cohorts, geography, ethn_non_e_slavs, prob_95pct,
      ethn_3, ethn_16, tatar_prob_95pct, cauc_prob_95pct,
      branch, rank_num,
      rank_large, rank_large_f, rank_known, federal_district,
      national_republics, millionaires, quintile_group, category, nonresponse_prop, source, ethnos, agr_ethnos, Armenian_full:Yakut_agr
    )
  ) |>
  mutate(
    # Replace millionaires with NA if geography is unknown
    millionaires = case_when(
      is.na(geography) ~ NA_real_,
      TRUE ~ as.double(millionaires)
    ),
    # Add Ufa and Kazan to national republics
    national_republics = case_when(
      geography %in% c("Г. Уфа", "Г. Казань") ~ 1,
      is.na(geography) ~ NA_real_,
      TRUE ~ as.double(national_republics)
    ),
    federal_district = case_when(
      federal_district == "Northeast" ~ "Northwest", # Fix an error
      TRUE ~ as.character(federal_district)
    )
  )

# Shuffle the rows, hash the name columns and attach variable labels.
ruaf_analysis_full <- ruaf_full_not_anon |>
  # Random sort key sized to THIS table (n() is the current row count), so
  # the shuffle cannot fail on a row-count mismatch with another object. The
  # seed set at the top of the script changes with the computer's clock.
  mutate(sort = runif(n())) |>
  arrange(sort) |>
  dplyr::select(-sort) |>
  mutate(
    across(
      c(surname, first, patronym),
      ~ anonymize(.x)
    )
  ) |>
  labelled::set_variable_labels(
    rank = "Military rank",
    surname = "Surname (anonymized)",
    first = "First name (anonymized)",
    patronym = "Patronymic (anonymized)",
    sex = "Assigned sex (inferred from the patronymic)",
    birth_cohorts = "Five-year birth cohorts",
    geography = "Geographic location of registration",
    ethn_non_e_slavs = "Probability of being non-East Slavic last name (inferred from the Memorial dataset)",
    prob_95pct = "If the record is binned in the 95% probability bin",
    ethn_3 = "Probability of being Tatar last name (inferred from the Memorial dataset)",
    ethn_16 = "Probability of being other Caucasus nationalities, except Armenian, Azerbaijani, or Georgian (inferred from the Memorial dataset)",
    tatar_prob_95pct = "If the record is binned in the 95% probability bin of being classified as Tatar",
    cauc_prob_95pct = "If the record is binned in the 95% probability bin of being classified as other Caucasus nationalities",
    rank_num = "Military rank number, ordered from the lowest to the highest (numeric)",
    rank_large = "Larger groups of military ranks, ordered from the lowest to the highest (numeric)",
    rank_large_f = "Larger groups of military ranks, ordered from the lowest to the highest (factor)",
    rank_known = "Whether the rank is known or not",
    federal_district = "Federal district, inferred from the geographic location of registration",
    national_republics = "Whether the registration is in a 'national republic/region' or not",
    millionaires = "Whether the registration is in a 'millionaire' city or not",
    quintile_group = "Quintile group of the geographic region, as per the Gross Regional Product in 2019",
    category = "Category of the geographic region's national composition, as per all records in the 2020-2021 census",
    nonresponse_prop = "The share of nonresponse answers in the nationality question in the 2020-2021 census",
    source = "Whether the record comes from smaller or larger source dataset",
    ethnos = "Individual national group as per Bessudnov et al model",
    agr_ethnos = "Aggregated national group as per Bessudnov et al model",
    Armenian_full= "Probability of classified as Armenian as per Bessudnov et al model",
    Azerbaijani_full= "Probability of classified as Azerbaijani as per Bessudnov et al model",
    Bashkir_full= "Probability of classified as Bashkir as per Bessudnov et al model",
    Belarusian_full= "Probability of classified as Belarusian as per Bessudnov et al model",
    Buryat_full= "Probability of classified as Buryat as per Bessudnov et al model",
    Chechen_full= "Probability of classified as Chechen as per Bessudnov et al model",
    Dagestani_full= "Probability of classified as Dagestani as per Bessudnov et al model",
    Georgian_full= "Probability of classified as Georgian as per Bessudnov et al model",
    Ingush_full= "Probability of classified as Ingush as per Bessudnov et al model",
    Jewish_full= "Probability of classified as Jewish as per Bessudnov et al model",
    KabardinAdyghe_full= "Probability of classified as Kabardin/Adyghe as per Bessudnov et al model",
    Kalmyk_full= "Probability of classified as Kalmyk as per Bessudnov et al model",
    KarachayBalkar_full= "Probability of classified as Karachay-Balkar as per Bessudnov et al model",
    Kazakh_full= "Probability of classified as Kazakh as per Bessudnov et al model",
    Kyrgyz_full= "Probability of classified as Kyrgyz as per Bessudnov et al model",
    Moldovan_full= "Probability of classified as Moldovan as per Bessudnov et al model",
    Ossetian_full= "Probability of classified as Ossetian as per Bessudnov et al model",
    Russian_full= "Probability of classified as Russian as per Bessudnov et al model",
    Tajik_full= "Probability of classified as Tajik as per Bessudnov et al model",
    Tatar_full= "Probability of classified as Tatar as per Bessudnov et al model",
    Tuvan_full= "Probability of classified as Tuvan as per Bessudnov et al model",
    Ukrainian_full= "Probability of classified as Ukrainian as per Bessudnov et al model",
    Uzbek_full= "Probability of classified as Uzbek as per Bessudnov et al model",
    Yakut_full= "Probability of classified as Yakut as per Bessudnov et al model",
    Armenian_agr= "Probability of classified as Armenian as per Bessudnov et al model (aggregated)",
    Azerbaijani_agr = "Probability of classified as Azerbaijani as per Bessudnov et al model (aggregated)",
    BashTat_agr= "Probability of classified as Bashkir/Tatar as per Bessudnov et al model (aggregated)",
    BelRusUkr_agr= "Probability of classified as Belarusian/Russian/Ukrainian as per Bessudnov et al model (aggregated)",
    Buryat_agr= "Probability of classified as Buryat as per Bessudnov et al model (aggregated)",
    CheDagIng_agr= "Probability of classified as Chechen/Dagestani/Ingush as per Bessudnov et al model (aggregated)",
    Georgian_agr= "Probability of classified as Georgian as per Bessudnov et al model (aggregated)",
    Jewish_agr= "Probability of classified as Jewish as per Bessudnov et al model (aggregated)",
    KabAdKarBalOs_agr= "Probability of classified as Kabardin/Adyghe/Karachay-Balkar/Ossetian as per Bessudnov et al model (aggregated)",
    Kalmyk_agr= "Probability of classified as Kalmyk as per Bessudnov et al model (aggregated)",
    KazKyr_agr= "Probability of classified as Kazakh/Kyrgyz as per Bessudnov et al model (aggregated)",
    Moldovan_agr= "Probability of classified as Moldovan as per Bessudnov et al model (aggregated)",
    TajUzb_agr= "Probability of classified as Tajik/Uzbek as per Bessudnov et al model (aggregated)",
    Tuvan_agr= "Probability of classified as Tuvan as per Bessudnov et al model (aggregated)",
    Yakut_agr= "Probability of classified as Yakut as per Bessudnov et al model (aggregated)"
  )


# Destination of the anonymized analysis dataset. Left blank like the other
# path placeholders in this script; set it to an .rds file path before
# running. (An unquoted placeholder here is a syntax error that prevents the
# whole file from being parsed.)
output_path <- ""

write_rds(ruaf_analysis_full, output_path)
